library(C50)
library(tidyverse)
library(tidymodels)
library(janitor)
library(skimr)
library(kableExtra)
library(GGally)
library(vip)
library(fastshap)
library(MASS)
library(rpart.plot)
library(factoextra)
library(imputeMissings)
library(ISLR)
library(tree)
library(clock)
library(ggplot2)
library(dplyr)
library(lubridate)
library(corrplot)
# Load the two source tables; clean_names() normalizes headers to snake_case.
# days_from_now = age of each project posting in days, relative to today.
donor <- read_csv("C:/Users/yuxia/Downloads/DonorMerge_Final (1).csv") %>%
  clean_names() %>%
  mutate(days_from_now = as.numeric(Sys.Date() - mdy(date_posted))) %>%
  dplyr::select(-date_posted)
donation <- read_csv("C:/Users/yuxia/Downloads/Donations (1).csv") %>%
  clean_names()
## Data preparation
# Profile missingness and distributions by column type.
# select(where(...)) replaces the superseded select_if(). Note: read_csv()
# never creates factors, so the factor skim is expected to be empty unless
# columns were converted upstream.
donor %>% select(where(is.factor)) %>% skim()
donor %>% select(where(is.numeric)) %>% skim()
| Name | Piped data |
| Number of rows | 328018 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| school_ncesid | 21067 | 0.94 | 2.606752e+11 | 1.590522e+11 | 1.00005e+10 | 9.0495e+10 | 2.6100e+11 | 3.7000e+11 | 6.100000e+11 | ▇▃▆▅▁ |
| school_latitude | 0 | 1.00 | 3.724000e+01 | 4.660000e+00 | 1.82500e+01 | 3.4040e+01 | 3.7670e+01 | 4.0760e+01 | 6.726000e+01 | ▁▇▇▁▁ |
| school_longitude | 0 | 1.00 | -9.344000e+01 | 1.781000e+01 | -1.71690e+02 | -1.1205e+02 | -8.7690e+01 | -7.9150e+01 | -6.663000e+01 | ▁▁▅▅▇ |
| school_zip | 2 | 1.00 | 5.310286e+04 | 3.123205e+04 | 4.10000e+02 | 2.7589e+04 | 5.3089e+04 | 8.5225e+04 | 9.995000e+04 | ▆▇▂▆▇ |
| great_messages_proportion | 89712 | 0.73 | 5.482000e+01 | 3.507000e+01 | 0.00000e+00 | 3.3000e+01 | 5.7000e+01 | 8.4000e+01 | 1.000000e+02 | ▆▃▆▆▇ |
| teacher_referred_count | 47338 | 0.86 | 9.300000e-01 | 2.350000e+00 | 0.00000e+00 | 0.0000e+00 | 0.0000e+00 | 1.0000e+00 | 1.250000e+02 | ▇▁▁▁▁ |
| non_teacher_referred_count | 47338 | 0.86 | 4.580000e+00 | 5.910000e+00 | 0.00000e+00 | 1.0000e+00 | 3.0000e+00 | 5.0000e+00 | 3.040000e+02 | ▇▁▁▁▁ |
| fulfillment_labor_materials | 17492 | 0.95 | 2.762000e+01 | 8.810000e+00 | 9.00000e+00 | 1.7000e+01 | 3.0000e+01 | 3.5000e+01 | 3.500000e+01 | ▁▂▁▁▇ |
| total_price_excluding_optional_s | 0 | 1.00 | 5.510900e+02 | 1.801938e+04 | 0.00000e+00 | 2.6817e+02 | 4.1058e+02 | 5.7910e+02 | 1.025002e+07 | ▇▁▁▁▁ |
| total_price_including_optional_s | 0 | 1.00 | 6.561900e+02 | 2.197364e+04 | 0.00000e+00 | 3.1902e+02 | 4.8750e+02 | 6.8816e+02 | 1.250002e+07 | ▇▁▁▁▁ |
| students_reached | 82 | 1.00 | 9.960000e+01 | 2.733940e+03 | 0.00000e+00 | 2.2000e+01 | 3.0000e+01 | 1.0000e+02 | 9.999990e+05 | ▇▁▁▁▁ |
| days_from_now | 0 | 1.00 | 4.315460e+03 | 7.992100e+02 | 3.26400e+03 | 3.6870e+03 | 4.1530e+03 | 4.8120e+03 | 7.391000e+03 | ▇▆▂▁▁ |
# Same profiling for the donation table; select(where(...)) replaces the
# superseded select_if().
donation %>% select(where(is.factor)) %>% skim()
donation %>% select(where(is.numeric)) %>% skim()
| Name | Piped data |
| Number of rows | 1048575 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| donor_zip | 580554 | 0.45 | 50667.01 | 33285.41 | 0.00 | 20003.0 | 46902.00 | 89135.00 | 99999 | ▇▆▂▅▇ |
| donation_to_project | 8 | 1.00 | 65.75 | 215.43 | -11.80 | 10.0 | 21.25 | 50.00 | 85000 | ▇▁▁▁▁ |
| donation_optional_support | 8 | 1.00 | 10.70 | 32.97 | -0.02 | 1.5 | 3.75 | 7.50 | 15000 | ▇▁▁▁▁ |
| donation_total | 8 | 1.00 | 76.45 | 243.75 | -11.80 | 10.0 | 25.00 | 56.47 | 100000 | ▇▁▁▁▁ |
# Class balance of the target variable.
donor %>%
  count(is_exciting) %>%
  mutate(pct = n / sum(n))
# Trim extreme outliers before plotting (prices, reach, referral counts).
donormerge <- donor %>%
  filter(
    total_price_excluding_optional_s < 1000,
    total_price_including_optional_s < 1000,
    students_reached < 250,
    teacher_referred_count < 5
  )
# Boxplot of one numeric column split by the is_exciting target.
# Renamed from `boxplot` to avoid masking graphics::boxplot(); the loop
# variable is renamed from `c`, which shadowed base::c().
plot_box <- function(m) {
  ggplot(donormerge,
         aes(x = !!as.name(m), y = as.factor(is_exciting),
             fill = as.factor(is_exciting))) +
    geom_boxplot() +
    labs(title = as.character(m), y = 'exciting')
}
numerics <- c('school_latitude', 'school_longitude', 'great_messages_proportion',
              'total_price_excluding_optional_s',
              'total_price_including_optional_s', 'students_reached',
              'days_from_now', 'teacher_referred_count')
for (col_name in numerics) {
  print(plot_box(col_name))
}
## Warning: Removed 35086 rows containing non-finite values (stat_boxplot).
We can see from the boxplots that exciting projects have a much higher
proportion of great messages (unique comments on a project page).
# Stacked proportion bar chart of is_exciting within each level of a
# categorical column. Rows with any NA are dropped before plotting.
char_fill <- function(col) {
  plot_data <- na.omit(donor)
  ggplot(plot_data, aes(!!as.name(col), fill = as.factor(is_exciting))) +
    geom_bar(position = 'fill') +
    coord_flip() +
    labs(y = 'proportion')
}
dummy <- c('school_city', 'school_state', 'school_metro', 'school_district',
           'school_county', 'teacher_prefix', 'primary_focus_subject',
           'primary_focus_area', 'secondary_focus_subject',
           'secondary_focus_area', 'resource_type', 'poverty_level',
           'grade_level', 'is_exciting', 'one_non_teacher_referred_donor_g',
           'school_charter', 'school_magnet', 'school_year_round',
           'school_nlns', 'school_kipp', 'school_charter_ready_promise',
           'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
           'eligible_double_your_impact_matc', 'eligible_almost_home_match')
for (column in dummy) {
  print(char_fill(column))
}
## explore the numeric variables (donation)
# Histograms of the three donation amounts, truncated below $200 so the
# long right tail does not dominate the plot.
donation %>%
  filter(donation_to_project < 200) %>%
  ggplot(aes(x = donation_to_project)) +
  geom_histogram(binwidth = 10)
donation %>%
  filter(donation_optional_support < 200) %>%
  ggplot(aes(x = donation_optional_support)) +
  geom_histogram(binwidth = 10)
donation %>%
  filter(donation_total < 200) %>%
  ggplot(aes(x = donation_total)) +
  geom_histogram(binwidth = 10)
## explore character variables
# Horizontal count bar chart for one categorical column of donation.
# Rows with any NA are dropped before plotting.
bar <- function(col) {
  plot_data <- na.omit(donation)
  ggplot(plot_data, aes(!!as.name(col))) +
    geom_bar() +
    coord_flip() +
    labs(y = 'count')
}
dummy <- c('is_teacher_acct', 'dollar_amount',
           'donation_included_optional_support', 'payment_method',
           'payment_included_acct_credit', 'payment_included_campaign_gift_card',
           'payment_included_web_purchased_gift_card',
           'payment_was_promo_matched', 'via_giving_page', 'for_honoree')
for (column in dummy) {
  print(bar(column))
}
We can infer from these charts that some variables have relatively low
association with donor behaviour (donation made for an honoree, donation
matched 1-1 with corporate funds, donation partly paid with a corporate
sponsored gift card), while others show higher association (donor is a
teacher, donation used account-credit redemption, donation included a
corporate sponsored gift card, donation given through a giving / campaign
page). ## Prepare for clustering
# Remove identifier, timestamp, and donor-geography columns that should not
# drive the clustering.
donation%>%dplyr::select(-donationid,-projectid,-donation_timestamp,-donor_acctid,-donor_city,-donor_zip,-donor_state)->cluster
# Create dummy variables
# One-hot encode the dollar_amount bands ('is_100_and_upn' carries a typo
# in its name but is kept as-is for downstream consistency with the
# recorded k-means output).
cluster$is_under_10 <- as.factor(ifelse(cluster$dollar_amount == 'under_10', 1, 0))
cluster$is_10_to_100 <- as.factor(ifelse(cluster$dollar_amount == '10_to_100', 1, 0))
cluster$is_100_and_upn <- as.factor(ifelse(cluster$dollar_amount == '100_and_up', 1, 0))
# One-hot encode payment_method.
cluster$no_cash_received <- as.factor(ifelse(cluster$payment_method == 'no_cash_received', 1, 0))
cluster$paypal <- as.factor(ifelse(cluster$payment_method == 'paypal', 1, 0))
cluster$creditcard <- as.factor(ifelse(cluster$payment_method == 'creditcard', 1, 0))
cluster$amazon <- as.factor(ifelse(cluster$payment_method == 'amazon', 1, 0))
cluster$double_your_impact_match <- as.factor(ifelse(cluster$payment_method == 'double_your_impact_match', 1, 0))
cluster$promo_code_match <- as.factor(ifelse(cluster$payment_method == 'promo_code_match', 1, 0))
cluster$check <- as.factor(ifelse(cluster$payment_method == 'check', 1, 0))
cluster$almost_home_match <- as.factor(ifelse(cluster$payment_method == 'almost_home_match', 1, 0))
# Convert logical flags to 0/1 factors.
# NOTE(review): is_teacher_acct is converted IN PLACE (no new is_* column,
# unlike the flags below), and the select() at the end of this section then
# removes is_teacher_acct — so the flag is dropped from the clustering data
# entirely (the recorded k-means cluster means confirm it is absent).
# Confirm whether that was intended.
cluster$is_teacher_acct <- as.factor(ifelse(cluster$is_teacher_acct == 'TRUE', 1, 0))
cluster$is_donation_included_optional_support <- as.factor(ifelse(cluster$donation_included_optional_support == 'TRUE', 1, 0))
cluster$is_payment_included_acct_credit <- as.factor(ifelse(cluster$payment_included_acct_credit == 'TRUE', 1, 0))
cluster$is_payment_included_campaign_gift_card <- as.factor(ifelse(cluster$payment_included_campaign_gift_card == 'TRUE', 1, 0))
cluster$is_payment_included_web_purchased_gift_card <- as.factor(ifelse(cluster$payment_included_web_purchased_gift_card == 'TRUE', 1, 0))
cluster$is_payment_was_promo_matched <- as.factor(ifelse(cluster$payment_was_promo_matched == 'TRUE', 1, 0))
cluster$is_via_giving_page <- as.factor(ifelse(cluster$via_giving_page == 'TRUE', 1, 0))
cluster$is_for_honoree <- as.factor(ifelse(cluster$for_honoree == 'TRUE', 1, 0))
# Remove the raw (pre-dummy) columns now that encodings exist.
# NOTE(review): -payment_included_campaign_gift_card is listed three times
# (harmless duplicates), and -is_teacher_acct drops the converted flag —
# see note above.
cluster%>%dplyr::select(-dollar_amount,-payment_method,-donation_message,-is_teacher_acct,-donation_included_optional_support,-payment_included_acct_credit,-payment_included_web_purchased_gift_card,-payment_was_promo_matched,-via_giving_page,-for_honoree,-payment_included_campaign_gift_card,-payment_included_campaign_gift_card,-payment_included_campaign_gift_card)->cluster
# Standardize numeric variables: append a scaled copy with suffix "_s",
# then drop the original column. all_of(col) (instead of the bare external
# vector `col`) removes the tidyselect ambiguity warning the original
# emitted (see the message below).
for(col in colnames(cluster %>% select_if(is.numeric))){
  cluster[, ncol(cluster) + 1] <- scale(cluster[col])
  names(cluster)[ncol(cluster)] <- paste0(col, '_s')
  cluster <- cluster %>% dplyr::select(-all_of(col))
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(col)` instead of `col` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Coerce every remaining column to numeric (factor -> character -> numeric)
# so k-means can run; across(where(...)) replaces the superseded mutate_if().
cluster <- cluster %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(across(where(is.character), as.numeric))
head(cluster)
# Choose number of clusters via the elbow (within-SS) plot, then fit k-means.
# Seed is set BEFORE sampling so the 5,000-row sample is reproducible; the
# original called set.seed() after sample_n(), so the sample itself differed
# on every run.
set.seed(30)
cluster_sample <- cluster %>%
  sample_n(5000)
fviz_nbclust(cluster_sample, kmeans, method = "wss")
clusters <- kmeans(cluster_sample, 5, iter.max = 500, nstart = 20)
print(clusters)
## K-means clustering with 5 clusters of sizes 1933, 183, 22, 2068, 794
##
## Cluster means:
## is_under_10 is_10_to_100 is_100_and_upn no_cash_received paypal
## 1 0.1350233 0.8649767 0.000000000 0.0000000 0.187790998
## 2 0.0000000 0.0000000 1.000000000 0.5409836 0.027322404
## 3 0.0000000 0.0000000 1.000000000 0.6818182 0.000000000
## 4 0.1939072 0.8041586 0.001934236 0.9956480 0.003868472
## 5 0.0000000 0.0000000 1.000000000 0.2959698 0.050377834
## creditcard amazon double_your_impact_match promo_code_match check
## 1 0.5690636 0.058975685 0.01034661 0.16761511 0.005690636
## 2 0.1967213 0.000000000 0.10382514 0.00000000 0.071038251
## 3 0.0000000 0.000000000 0.04545455 0.00000000 0.181818182
## 4 0.0000000 0.000483559 0.00000000 0.00000000 0.000000000
## 5 0.3438287 0.027707809 0.17380353 0.05793451 0.022670025
## almost_home_match is_donation_included_optional_support
## 1 0.0005173306 0.9146405
## 2 0.0601092896 0.9836066
## 3 0.0909090909 0.9545455
## 4 0.0000000000 0.8984526
## 5 0.0277078086 0.9030227
## is_payment_included_acct_credit is_payment_included_campaign_gift_card
## 1 0.01345059 0.01810657
## 2 0.38251366 0.07650273
## 3 0.40909091 0.13636364
## 4 0.15764023 0.75870406
## 5 0.13853904 0.12846348
## is_payment_included_web_purchased_gift_card is_payment_was_promo_matched
## 1 0.002069322 0.18986032
## 2 0.098360656 0.00000000
## 3 0.136363636 0.00000000
## 4 0.091392650 0.00000000
## 5 0.051637280 0.05541562
## is_via_giving_page is_for_honoree donation_to_project_s
## 1 0.4899121 0.019141231 -0.1878061
## 2 0.3442623 0.027322404 1.9389363
## 3 0.4090909 0.000000000 7.0112678
## 4 0.1542553 0.007736944 -0.2226170
## 5 0.3916877 0.039042821 0.3786486
## donation_optional_support_s donation_total_s
## 1 -0.1980801 -0.1926735
## 2 2.2576854 2.0190359
## 3 7.5122981 7.2127789
## 4 -0.2388145 -0.2290560
## 5 0.3874131 0.3870553
##
## Clustering vector:
## [1] 1 1 4 5 4 1 1 5 1 4 1 4 5 4 2 5 4 5 1 5 1 5 4 1 4 5 4 1 4 1 4 4 1 4 4 4 1
## [38] 4 1 1 5 2 1 4 4 4 4 4 1 4 4 1 4 4 1 5 4 4 1 4 2 4 1 4 5 1 1 4 4 1 1 1 1 4
## [75] 4 4 1 1 4 1 4 4 4 4 1 4 4 4 1 4 5 4 1 4 1 5 1 1 1 4 1 2 4 4 4 1 4 1 1 1 4
## [112] 1 5 1 2 1 4 1 4 5 1 4 1 5 1 1 1 1 4 4 5 4 5 5 2 1 4 4 4 1 1 1 4 1 2 5 1 4
## [149] 4 2 4 4 1 1 5 4 4 4 1 4 4 4 5 4 5 4 4 4 1 1 1 4 4 1 4 1 4 1 5 1 1 1 5 2 4
## [186] 1 5 1 4 4 1 5 1 4 1 1 1 2 4 1 1 1 4 1 4 4 1 1 4 2 4 4 5 4 1 1 4 2 1 4 4 1
## [223] 4 4 1 4 5 4 5 1 4 4 1 1 1 1 4 4 4 1 4 4 1 1 4 1 5 5 5 4 4 4 1 4 4 1 1 1 1
## [260] 1 4 1 5 5 4 5 1 5 1 1 4 4 5 5 1 1 1 1 1 1 1 2 4 5 4 1 1 4 4 5 1 4 1 4 1 5
## [297] 4 4 1 4 4 5 1 1 1 5 1 1 1 4 4 4 2 4 1 1 1 4 1 5 4 4 1 4 4 4 4 1 4 1 4 4 4
## [334] 1 1 4 5 1 5 4 5 4 5 1 1 1 4 4 1 4 1 4 1 4 4 1 2 5 2 5 5 1 4 1 5 1 4 5 4 1
## [371] 4 4 4 4 4 4 1 4 2 1 5 5 4 1 4 4 4 1 4 5 5 5 4 1 4 1 1 1 4 4 1 1 5 2 4 4 1
## [408] 4 1 1 1 4 4 4 1 4 4 5 4 5 1 1 5 2 2 5 2 1 1 1 1 4 1 4 4 1 1 4 1 1 5 5 4 4
## [445] 4 4 5 4 1 4 1 1 1 4 1 1 1 4 4 4 1 4 1 4 1 4 1 4 1 2 5 4 1 5 5 1 4 4 1 4 4
## [482] 4 1 4 4 1 1 4 1 4 1 4 1 4 5 4 1 1 1 4 4 4 4 5 2 4 1 1 4 4 4 4 1 5 4 1 1 5
## [519] 4 2 5 1 1 4 1 4 4 4 4 1 1 4 4 4 1 5 1 1 5 1 4 4 5 4 1 5 1 1 4 1 1 1 4 5 1
## [556] 1 4 1 4 4 1 4 4 1 1 4 4 5 4 1 1 5 1 4 5 2 4 4 1 1 1 4 1 4 1 4 1 4 1 4 4 5
## [593] 4 4 1 5 1 5 1 5 4 1 4 1 4 4 4 5 4 4 1 5 1 1 4 1 4 1 1 1 1 1 4 1 5 5 1 4 4
## [630] 1 1 1 1 1 5 1 4 5 1 4 1 1 1 5 5 4 4 4 4 1 1 4 2 2 4 1 4 1 5 1 4 4 1 4 1 1
## [667] 1 1 1 2 4 4 2 1 5 5 1 1 1 1 1 1 5 4 4 1 5 5 1 4 4 5 1 1 4 1 1 4 4 4 5 1 1
## [704] 1 4 4 4 1 1 1 1 1 4 4 1 1 1 1 1 1 4 4 4 5 4 1 5 4 4 2 1 4 4 4 1 4 4 1 1 1
## [741] 1 1 1 5 4 4 4 1 4 1 5 4 1 5 4 4 5 4 4 4 5 5 2 4 1 5 1 1 4 1 4 4 1 4 1 1 5
## [778] 4 1 1 5 4 1 4 4 1 1 4 4 1 1 4 5 5 4 4 4 4 2 5 1 4 4 4 4 5 1 1 4 4 1 4 4 4
## [815] 4 4 1 1 4 1 1 4 4 5 4 1 1 4 4 1 4 1 4 1 4 4 1 1 1 4 5 5 1 4 1 5 5 1 5 4 1
## [852] 1 4 4 1 4 5 1 4 5 4 4 1 4 4 2 5 1 1 4 4 4 4 4 4 4 4 4 5 1 5 4 1 1 4 5 4 4
## [889] 1 1 4 1 5 4 1 4 5 4 4 4 1 4 4 4 5 5 1 4 4 5 4 4 4 4 2 5 5 1 4 4 4 1 5 5 1
## [926] 1 1 1 4 1 5 4 1 5 1 1 5 4 4 5 2 1 1 4 1 4 4 4 1 4 5 4 1 1 4 5 4 4 4 1 4 4
## [963] 5 4 1 4 4 4 1 5 5 4 4 4 4 1 1 4 4 1 4 4 1 1 5 4 1 1 1 4 4 1 5 4 1 1 5 5 4
## [1000] 1 4 1 1 4 4 4 5 4 4 4 4 2 1 1 5 4 5 4 5 4 5 1 4 4 1 4 4 4 4 1 4 1 1 1 1 4
## [1037] 1 1 1 4 4 1 1 4 4 4 4 4 1 1 4 5 1 5 4 4 2 4 1 4 1 1 4 4 5 1 1 5 5 4 4 4 1
## [1074] 1 4 1 2 5 5 4 4 4 1 4 5 4 4 1 1 1 1 1 2 1 4 1 4 2 4 4 4 1 1 5 4 1 1 4 5 1
## [1111] 1 1 1 1 1 4 4 4 1 5 1 4 5 1 2 4 1 1 4 1 4 5 4 1 4 2 5 5 1 4 1 1 4 4 1 4 4
## [1148] 4 4 5 1 4 4 1 4 2 4 5 1 1 4 4 4 4 1 5 4 1 1 2 4 1 5 4 1 1 5 4 1 2 1 4 1 1
## [1185] 1 4 1 1 4 4 5 5 5 4 1 5 4 1 4 4 5 4 5 1 4 4 1 1 4 4 2 1 4 1 4 4 1 1 4 4 1
## [1222] 4 5 1 5 1 1 5 4 4 5 5 4 1 1 4 1 4 1 1 1 4 4 1 5 4 1 1 1 1 4 1 1 4 4 4 4 1
## [1259] 4 4 1 4 4 4 1 4 1 4 4 5 4 1 4 1 4 5 4 5 1 4 4 4 4 1 1 4 1 1 1 4 5 1 5 1 4
## [1296] 1 4 4 5 1 4 4 1 1 4 4 1 1 4 4 4 4 1 5 1 4 4 1 4 2 1 1 2 4 1 4 1 4 4 5 1 1
## [1333] 5 1 4 4 4 4 4 1 1 1 4 1 4 4 1 5 1 4 4 1 1 4 1 1 4 5 4 1 4 4 4 4 5 1 4 2 1
## [1370] 5 1 4 1 4 1 5 1 4 1 4 1 1 5 5 4 4 4 5 1 4 1 1 5 4 4 4 1 5 4 1 1 1 1 4 1 1
## [1407] 1 4 4 4 1 5 4 1 1 1 4 1 1 4 4 4 4 5 4 4 1 1 4 1 4 1 5 1 4 4 4 1 1 5 2 4 1
## [1444] 1 1 1 5 4 1 1 1 4 1 4 5 1 1 5 4 1 4 1 4 1 4 1 4 1 1 5 1 1 5 5 5 1 4 4 4 4
## [1481] 2 4 1 1 4 4 1 4 2 5 1 5 5 2 1 4 5 4 4 1 1 4 1 1 1 1 2 1 4 4 5 1 5 1 1 1 1
## [1518] 4 2 1 1 4 5 1 2 4 1 5 1 4 4 1 5 5 4 4 4 1 1 4 1 5 1 1 1 1 5 4 1 5 5 1 3 1
## [1555] 1 1 4 4 1 5 4 5 4 4 1 4 4 1 5 1 5 4 4 1 1 1 4 2 1 5 4 2 1 4 5 2 2 4 4 1 1
## [1592] 3 4 4 4 1 5 1 4 4 4 1 1 4 1 4 4 4 1 4 4 1 4 2 4 5 4 4 4 2 1 4 4 1 1 4 4 4
## [1629] 1 4 1 4 4 1 1 5 1 1 5 4 4 5 1 4 4 5 1 4 4 1 1 2 1 1 4 5 1 5 4 4 4 1 5 1 4
## [1666] 1 1 5 4 4 4 1 5 4 5 4 2 5 1 5 4 5 1 1 4 4 5 5 1 1 5 4 4 1 4 1 5 5 4 4 4 4
## [1703] 4 4 1 1 5 1 5 2 4 1 1 1 1 4 4 4 5 4 5 2 4 4 5 1 4 4 5 5 5 4 1 4 4 4 1 4 1
## [1740] 4 4 1 1 2 2 1 1 1 1 5 5 4 4 4 1 1 1 4 4 1 4 4 5 3 4 1 1 1 1 2 4 1 1 4 4 5
## [1777] 1 5 1 1 4 4 4 4 4 1 4 5 1 4 5 1 4 1 4 1 1 4 4 1 1 4 4 1 1 1 4 1 1 1 1 1 1
## [1814] 1 4 4 1 2 1 1 1 1 4 5 4 4 4 4 4 1 2 5 4 4 4 4 4 1 4 1 1 4 2 4 5 4 4 1 4 1
## [1851] 1 5 1 1 1 1 4 5 1 1 4 4 1 1 1 4 1 1 5 1 4 1 1 4 1 4 1 5 1 4 4 1 1 4 1 5 1
## [1888] 4 4 5 5 5 4 5 1 4 1 4 1 4 4 1 1 1 1 1 4 1 4 5 5 1 5 4 4 1 4 4 4 5 4 5 5 4
## [1925] 1 1 4 5 4 1 4 1 1 4 4 1 5 1 1 4 4 4 4 1 4 4 1 1 1 4 4 5 4 4 1 5 4 1 4 5 5
## [1962] 1 5 3 5 1 4 4 1 4 1 1 1 1 4 1 1 1 4 4 4 5 1 4 4 4 5 4 5 1 1 4 4 5 4 1 4 1
## [1999] 4 1 1 5 4 1 1 4 4 5 4 1 4 1 1 5 1 1 4 5 1 1 4 2 2 4 2 5 4 1 5 1 1 5 5 4 1
## [2036] 1 4 1 5 4 4 4 1 5 4 3 4 4 4 4 4 5 1 3 4 1 5 5 5 5 5 1 4 4 5 1 1 5 5 1 1 4
## [2073] 1 4 4 1 4 1 1 1 3 4 1 1 1 1 4 4 5 4 5 5 5 5 4 1 4 5 4 4 1 1 1 5 4 4 4 4 4
## [2110] 4 4 1 5 4 2 4 1 1 4 1 1 4 4 4 5 5 5 4 4 2 5 4 5 4 1 4 1 1 4 4 1 4 4 1 4 1
## [2147] 1 5 5 2 4 1 1 4 4 4 4 4 1 5 1 1 4 4 1 4 1 1 1 1 5 4 4 4 1 5 4 1 1 2 1 2 1
## [2184] 4 4 4 4 4 1 4 5 1 5 1 4 1 1 4 1 1 1 2 5 1 1 4 1 1 4 1 4 4 4 1 5 4 1 4 1 4
## [2221] 4 1 4 1 4 4 5 4 1 4 4 4 4 5 1 1 5 1 1 1 1 2 1 4 4 1 4 2 1 1 4 4 4 1 4 1 4
## [2258] 5 4 1 1 1 2 5 5 1 1 2 4 4 5 4 1 1 1 1 4 4 1 1 5 1 4 5 4 1 1 5 4 5 1 4 2 4
## [2295] 1 1 4 4 4 5 4 1 4 2 1 5 1 1 1 4 4 1 4 4 4 4 1 4 1 1 4 1 1 1 4 4 5 1 4 5 4
## [2332] 4 4 1 5 5 4 1 5 1 1 5 1 5 4 5 4 5 5 1 1 2 4 1 5 1 1 4 4 4 4 2 1 4 4 1 5 4
## [2369] 4 1 4 4 1 5 1 1 2 5 1 1 4 4 4 1 1 4 5 4 1 5 5 1 2 1 5 4 4 1 1 1 4 4 4 4 5
## [2406] 5 4 1 1 1 4 4 4 4 4 1 5 4 4 4 1 4 1 2 1 4 1 1 4 5 1 4 5 1 1 1 4 1 1 4 4 1
## [2443] 4 5 4 4 5 4 4 4 4 4 1 5 1 4 1 2 4 1 4 4 2 4 1 1 5 4 5 4 1 4 5 4 5 5 1 4 1
## [2480] 1 5 1 5 1 5 5 5 1 4 4 1 1 1 4 1 1 4 1 4 4 4 4 2 1 1 1 4 1 1 5 1 1 4 1 5 5
## [2517] 4 3 1 4 4 5 1 4 4 5 4 5 1 1 4 4 4 4 4 1 1 4 1 1 4 4 2 1 5 4 1 5 5 1 1 1 1
## [2554] 1 5 1 1 4 4 4 1 4 1 4 5 4 1 1 1 4 4 4 4 4 5 5 4 4 4 1 1 1 4 4 4 4 4 1 1 1
## [2591] 1 1 1 1 4 4 4 1 1 4 4 4 4 4 4 4 5 4 4 4 4 4 4 1 1 1 4 2 4 5 4 1 4 1 5 5 1
## [2628] 1 4 4 1 2 4 1 4 1 5 4 4 1 2 4 4 1 4 1 4 1 4 4 1 5 4 1 5 4 1 1 1 4 5 3 1 5
## [2665] 4 5 1 4 1 4 1 1 4 4 5 4 5 1 4 4 1 1 1 5 4 5 1 5 4 4 1 4 5 1 1 5 4 4 1 4 4
## [2702] 4 4 1 5 4 4 4 1 5 5 4 1 5 1 5 1 2 4 4 5 1 5 1 4 4 4 5 5 4 4 4 1 4 1 4 4 4
## [2739] 4 5 1 1 4 4 4 5 1 4 1 4 4 1 1 4 1 1 4 4 4 1 1 5 1 4 1 1 1 1 1 4 1 5 5 4 1
## [2776] 4 5 1 4 1 1 4 4 4 1 4 4 1 4 4 1 1 5 2 1 4 4 2 5 4 4 5 4 4 3 5 4 4 5 4 5 4
## [2813] 1 4 4 1 1 1 4 4 1 5 1 4 1 4 1 4 1 1 4 5 5 5 1 4 1 1 4 1 4 4 1 4 1 4 1 1 1
## [2850] 1 4 4 4 5 1 5 1 4 4 1 1 5 1 4 1 1 1 5 1 4 4 4 1 1 5 1 4 1 5 4 5 1 4 4 4 4
## [2887] 4 1 1 1 2 1 1 4 1 1 5 4 4 1 4 1 5 2 1 4 5 4 1 1 1 1 4 5 1 1 4 1 4 4 4 2 4
## [2924] 1 1 4 1 5 1 1 4 4 5 4 1 4 1 4 4 1 1 4 4 4 1 5 4 1 4 4 1 1 5 5 1 5 1 4 1 4
## [2961] 1 4 1 5 1 5 4 1 5 4 4 5 5 5 5 5 1 1 4 1 4 4 4 1 5 1 4 1 5 1 2 1 4 4 1 1 4
## [2998] 4 4 4 1 1 1 5 1 2 3 1 1 5 4 4 4 1 4 4 4 4 4 2 4 5 1 4 5 4 1 1 5 4 1 4 4 1
## [3035] 1 4 1 4 4 4 1 1 5 1 4 4 4 4 1 1 5 4 4 4 1 4 2 4 5 5 1 1 5 1 5 4 5 4 2 1 5
## [3072] 4 1 1 1 4 5 1 1 5 1 4 1 1 1 4 5 5 4 4 4 1 1 5 5 1 1 1 1 4 4 4 4 5 5 1 4 5
## [3109] 1 4 4 1 1 1 1 1 1 1 1 1 1 4 1 4 4 1 5 1 4 5 1 1 4 1 1 1 1 1 1 2 1 5 1 1 1
## [3146] 4 4 1 4 4 1 1 4 4 1 1 4 5 4 1 4 1 1 4 1 2 2 1 4 5 1 4 4 2 1 4 5 1 1 1 4 4
## [3183] 4 1 2 4 5 4 4 1 4 1 1 1 1 1 4 5 2 1 4 1 1 4 1 4 4 4 1 4 1 4 1 1 1 4 1 4 5
## [3220] 4 4 4 1 4 1 5 4 1 2 4 1 1 5 1 2 4 4 5 1 1 1 1 4 1 5 1 1 1 4 2 1 4 1 5 5 1
## [3257] 4 1 2 2 1 4 1 5 1 4 4 1 2 4 1 4 5 1 4 5 5 1 5 5 4 1 1 1 1 5 1 4 1 2 4 4 4
## [3294] 5 1 4 4 4 4 1 1 4 4 1 4 4 5 4 4 4 1 5 3 1 4 4 4 4 4 5 4 4 1 4 1 1 1 4 1 4
## [3331] 1 4 2 1 1 1 1 4 5 5 4 1 4 4 4 4 5 4 2 4 1 4 1 1 5 4 1 1 4 5 4 1 4 1 4 4 1
## [3368] 5 4 1 4 5 4 4 4 5 1 5 1 1 4 4 5 4 2 1 4 1 4 1 2 1 5 1 1 2 4 4 5 1 3 1 1 4
## [3405] 1 1 1 4 2 5 5 4 1 5 4 4 5 1 5 1 5 1 4 1 1 4 1 1 4 4 4 4 1 1 5 5 4 4 4 4 4
## [3442] 4 4 1 4 1 1 1 1 4 4 4 5 4 4 1 5 4 4 4 5 4 4 4 5 1 4 1 2 1 4 1 4 4 4 2 1 1
## [3479] 4 1 4 4 4 1 1 1 2 1 4 4 1 1 1 2 1 4 4 4 4 5 1 4 4 1 4 1 4 4 1 4 4 1 1 4 4
## [3516] 4 4 1 5 1 4 1 5 4 1 1 4 4 1 5 4 5 4 1 1 1 2 4 4 4 1 1 1 4 4 4 1 4 1 4 4 5
## [3553] 1 1 5 5 4 5 1 4 4 4 5 4 1 5 4 4 1 5 4 1 4 5 1 5 4 4 4 2 4 1 4 1 4 1 5 4 4
## [3590] 1 1 4 1 5 1 1 4 4 1 1 1 4 2 5 1 1 1 1 1 4 1 1 4 1 5 4 4 4 1 1 1 4 1 1 4 1
## [3627] 1 4 4 4 1 1 5 4 1 4 4 4 4 1 4 5 1 5 1 4 1 5 1 4 1 4 1 1 4 1 5 1 1 5 4 1 5
## [3664] 1 1 4 2 4 4 4 1 4 5 5 1 4 4 5 5 1 4 5 4 1 4 1 1 4 1 3 1 4 4 4 4 1 1 1 1 4
## [3701] 2 1 4 5 1 1 1 5 1 1 4 4 4 4 1 4 4 1 1 4 4 1 4 4 5 4 5 2 1 4 4 5 1 4 4 1 4
## [3738] 4 4 4 1 1 5 1 1 4 1 1 1 4 4 4 1 1 1 4 4 1 2 4 1 5 1 4 1 4 4 4 1 4 1 1 1 1
## [3775] 4 1 4 1 4 4 1 1 4 4 1 1 5 1 4 4 4 1 5 4 5 5 5 1 1 4 1 4 5 1 1 5 5 1 5 1 1
## [3812] 4 5 1 5 4 5 1 4 1 1 4 4 5 1 4 4 5 4 1 1 2 1 1 1 1 1 1 4 1 4 1 4 4 5 1 4 4
## [3849] 1 5 1 5 1 1 4 1 4 4 5 4 1 4 1 5 1 1 4 4 4 1 4 4 4 5 4 1 4 5 2 1 4 1 1 4 4
## [3886] 4 2 4 4 4 4 1 4 4 4 1 1 5 1 4 4 4 5 5 1 5 1 4 5 4 1 4 5 1 4 5 4 4 1 5 4 4
## [3923] 1 5 5 4 4 1 1 1 5 4 4 4 4 1 1 4 4 4 1 5 1 1 1 1 4 5 2 1 1 1 4 1 4 4 4 1 5
## [3960] 1 4 1 4 1 5 4 4 5 4 1 5 4 5 4 2 4 4 4 1 1 4 4 1 1 4 1 4 1 1 4 5 1 4 5 4 1
## [3997] 1 1 5 1 4 4 1 2 1 4 1 4 5 1 4 4 1 4 4 4 1 4 1 3 1 1 4 5 4 4 4 1 1 4 4 1 5
## [4034] 1 1 1 4 4 4 4 4 5 4 4 1 4 4 4 1 4 5 1 4 1 1 1 4 4 1 4 4 1 4 1 1 1 1 4 1 2
## [4071] 4 4 1 5 4 4 1 1 4 4 4 1 1 5 4 4 4 4 5 4 1 1 1 2 4 1 4 1 4 4 1 4 4 1 1 4 5
## [4108] 1 4 1 4 1 1 1 4 1 4 4 5 4 5 4 5 1 4 1 5 4 1 1 5 4 1 1 4 4 4 1 1 2 4 4 4 4
## [4145] 5 4 1 1 4 4 1 4 4 5 5 1 1 1 5 1 4 4 1 4 4 4 4 4 5 1 4 4 1 4 1 1 1 4 4 4 4
## [4182] 2 2 4 1 5 5 4 1 4 4 5 2 4 5 5 1 1 4 5 4 1 5 4 4 4 4 4 4 5 4 4 5 1 4 4 1 1
## [4219] 1 4 1 2 4 4 4 4 2 1 1 1 5 3 5 1 4 1 4 5 4 5 1 4 4 4 4 4 1 4 1 5 1 5 1 1 1
## [4256] 4 1 4 1 5 4 4 2 5 1 4 1 2 1 5 4 4 1 1 1 1 5 4 5 2 5 5 1 4 5 1 1 1 4 5 1 1
## [4293] 4 1 1 1 1 4 4 4 4 4 4 1 1 4 4 5 5 4 4 1 4 1 1 1 4 5 1 4 5 1 5 4 4 4 1 4 1
## [4330] 5 4 1 5 4 1 4 5 2 1 4 1 2 1 1 5 4 1 4 4 4 1 3 5 4 1 1 1 4 1 4 4 5 4 1 1 4
## [4367] 5 4 4 5 4 5 1 5 1 1 4 4 4 1 5 1 4 4 4 2 5 4 5 5 4 4 5 1 4 4 5 1 1 1 4 1 1
## [4404] 4 1 5 4 1 5 5 1 4 5 1 1 5 1 4 1 1 4 1 4 1 4 1 5 1 4 1 2 4 4 4 4 1 4 1 4 4
## [4441] 5 4 1 1 5 4 5 4 1 4 1 1 4 1 4 5 4 5 4 1 1 5 1 1 1 4 1 4 5 5 2 4 5 4 4 4 1
## [4478] 4 4 5 1 1 1 4 1 4 1 5 1 4 1 5 1 1 4 1 4 4 4 4 4 4 4 1 4 4 4 4 4 5 5 5 4 5
## [4515] 1 1 1 1 1 4 4 1 1 1 1 1 1 1 1 4 4 1 4 1 1 2 5 5 5 3 4 4 5 1 4 2 2 1 4 1 4
## [4552] 4 1 1 1 1 1 4 4 5 2 4 4 1 1 1 4 4 4 1 4 4 1 5 4 4 4 1 4 4 5 1 1 4 4 1 4 1
## [4589] 4 1 1 4 1 1 4 5 5 4 4 2 3 4 4 5 4 5 4 4 5 4 1 4 5 5 1 1 1 5 4 1 1 4 4 5 4
## [4626] 1 5 1 4 4 4 5 4 1 1 4 4 4 1 1 4 5 1 5 5 1 4 1 4 4 4 1 4 4 1 5 5 4 4 5 4 1
## [4663] 4 4 4 1 1 1 1 1 4 5 4 1 1 4 1 4 5 1 5 5 1 4 1 1 1 4 4 4 4 5 4 1 5 4 1 4 1
## [4700] 4 5 1 1 1 4 4 4 1 4 1 1 1 5 1 4 4 1 5 4 5 1 4 5 1 1 4 1 2 1 1 4 1 1 4 1 5
## [4737] 4 4 4 1 4 1 1 4 4 1 5 5 4 1 5 4 4 1 4 4 5 2 3 1 1 5 4 4 4 5 4 5 1 1 1 1 4
## [4774] 1 5 1 1 3 1 5 4 4 4 4 4 5 2 4 5 4 1 1 4 5 4 2 1 5 5 5 5 1 2 4 4 4 1 5 5 1
## [4811] 4 1 4 1 1 4 4 5 4 4 1 2 5 1 1 1 4 1 1 1 1 5 5 5 1 4 4 2 2 4 4 4 5 2 4 4 4
## [4848] 4 4 1 4 1 4 4 4 4 1 1 1 4 5 1 2 4 2 5 5 1 1 1 5 1 2 1 5 1 1 1 5 5 1 4 1 1
## [4885] 4 4 1 4 1 1 1 4 1 1 4 4 1 1 5 4 4 4 5 1 4 1 5 1 4 1 1 4 4 1 2 4 4 1 4 2 1
## [4922] 1 4 4 2 1 4 4 4 1 4 4 1 1 4 4 4 1 1 4 4 1 1 5 4 5 4 5 1 3 1 5 4 4 1 1 1 4
## [4959] 1 4 1 4 1 1 1 4 5 1 1 4 1 1 4 1 1 2 5 4 5 1 4 4 4 4 4 4 1 4 5 1 1 4 4 5 5
## [4996] 4 5 1 4 4
##
## Within cluster sum of squares by cluster:
## [1] 2711.7616 599.2561 512.7381 2005.3851 1455.3231
## (between_SS / total_SS = 58.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualize clusters
fviz_cluster(clusters, cluster_sample, ellipse.type = "norm", geom = "point")
# Profile clusters: attach the assignment as a column, then compare each
# variable's overall distribution with its per-cluster distribution.
cluster_sample$cluster <- as.factor(clusters$cluster)
for (col in colnames(cluster_sample %>% select_if(is.factor))) {
  print(cluster_sample %>% ggplot(aes(!!as.name(col))) + geom_bar())
  # Facet on the attached data-frame column rather than the external
  # clusters$cluster vector, so ggplot evaluates it inside the plotted data
  # (the external-vector form silently relies on row order matching).
  print(cluster_sample %>% ggplot(aes(!!as.name(col))) + geom_bar() +
          facet_wrap(~cluster))
}
for (col in colnames(cluster_sample %>% select_if(is.numeric))) {
  print(cluster_sample %>% ggplot(aes(!!as.name(col))) + geom_histogram())
  print(cluster_sample %>% ggplot(aes(!!as.name(col))) + geom_histogram() +
          facet_wrap(~cluster))
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Build the modeling frame: drop identifiers, geography, and two focus
# columns, then convert characters and logical flags to factors.
# The eleven repetitive data$x <- as.factor(data$x) lines are collapsed
# into a single across() call with identical results.
data <- donor %>%
  dplyr::select(-projectid, -school_latitude, -school_longitude,
                -teacher_acctid, -schoolid, -school_ncesid, -school_city,
                -school_state, -school_zip, -school_metro, -school_district,
                -school_county, -primary_focus_subject,
                -secondary_focus_area) %>%
  mutate(across(where(is.character), factor)) %>%
  # read_csv() parses TRUE/FALSE columns as logical; the models need factors.
  mutate(across(c(is_exciting, one_non_teacher_referred_donor_g,
                  school_charter, school_magnet, school_year_round,
                  school_nlns, school_kipp, school_charter_ready_promise,
                  teacher_teach_for_america, teacher_ny_teaching_fellow,
                  eligible_double_your_impact_matc,
                  eligible_almost_home_match), as.factor))
head(data)
# Seed BEFORE sampling so the 100k-row sample (and therefore the split) is
# reproducible; the original called set.seed() after sample_n(), so the
# sampled rows changed on every run.
set.seed(1200)
data_sample <- data %>% sample_n(100000)
split <- initial_split(data_sample, prop = 0.75)
train <- training(split)
test <- testing(split)
sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(data_sample) * 100)
## [1] "Train PCT : 75.00%"
sprintf("Test PCT : %1.2f%%", nrow(test)/ nrow(data_sample) * 100)
## [1] "Test PCT : 25.00%"
train
# Preprocessing recipe: impute, guard factor levels, pool rare levels,
# one-hot encode, drop near-zero-variance columns. Step order matters:
# novel/unknown must precede step_other and step_dummy.
donor_recipe <- recipe(is_exciting ~.,data = train) %>%
step_impute_median(all_numeric_predictors()) %>% # impute missing numerics with the median
step_novel(all_nominal_predictors()) %>% # handle factor levels unseen in training
step_unknown(all_nominal_predictors()) %>% # recode missing nominal values as "unknown"
step_other(all_nominal_predictors(),threshold = 0.01) %>% # pool levels occurring in <1% of rows
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # one-hot encode nominal predictors
step_nzv(all_predictors())%>% prep() # drop near-zero-variance columns, then estimate
# -- apply the recipe to both partitions
bake_train <- bake(donor_recipe, new_data = train)
bake_test <- bake(donor_recipe, new_data = test)
# Baseline logistic regression on all baked predictors.
log_model <-logistic_reg(mode = "classification") %>%
set_engine("glm") %>%
fit(is_exciting ~ ., data = bake_train)
# NOTE(review): the glm warnings below (non-convergence, fitted
# probabilities of 0 or 1) indicate quasi-complete separation among the
# one-hot predictors; interpret these coefficients with caution.
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table rounded to 4 dp; across(all_of(...)) replaces the
# superseded mutate_at().
tidy(log_model) %>%
  mutate(across(all_of(c("estimate", "std.error", "statistic", "p.value")),
                ~ round(.x, 4)))
## New model restricted to the significant predictors from the full fit.
# The original formula listed eligible_double_your_impact_matc_FALSE. twice;
# the duplicate is removed (the model matrix ignores it either way).
log_model2 <- logistic_reg(mode = "classification") %>%
  set_engine("glm") %>%
  fit(is_exciting ~ great_messages_proportion + teacher_referred_count +
        non_teacher_referred_count + fulfillment_labor_materials +
        total_price_excluding_optional_s + total_price_including_optional_s +
        days_from_now + teacher_teach_for_america_FALSE. +
        teacher_prefix_Mr. + one_non_teacher_referred_donor_g_FALSE. +
        eligible_double_your_impact_matc_FALSE. + teacher_prefix_Mrs. +
        resource_type_Technology + eligible_almost_home_match_FALSE.,
      data = bake_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table for the reduced model, rounded to 4 dp;
# across(all_of(...)) replaces the superseded mutate_at().
tidy(log_model2) %>%
  mutate(across(all_of(c("estimate", "std.error", "statistic", "p.value")),
                ~ round(.x, 4)))
# -- training predictions from the reduced logistic model:
# class probabilities, then predicted class, then the baked predictors.
scored_train_log_model2 <- predict(log_model2, bake_train, type = "prob") %>%
  bind_cols(predict(log_model2, bake_train)) %>%
  bind_cols(bake_train)
head(scored_train_log_model2)
# -- testing predictions, same layout
scored_test_log_model2 <- predict(log_model2, bake_test, type = "prob") %>%
  bind_cols(predict(log_model2, bake_test)) %>%
  bind_cols(bake_test)
head(scored_test_log_model2)
# -- accuracy / kap / AUC-style metrics on train and test, stacked
train_part <- scored_train_log_model2 %>%
  metrics(is_exciting, .pred_FALSE, estimate = .pred_class) %>%
  mutate(part = "training")
test_part <- scored_test_log_model2 %>%
  metrics(is_exciting, .pred_FALSE, estimate = .pred_class) %>%
  mutate(part = "testing")
bind_rows(train_part, test_part)
# Precision for the positive ("TRUE") class — event_level = 'second'
# because TRUE is the second factor level — on train and test.
bind_rows(
  scored_train_log_model2 %>%
    precision(is_exciting, .pred_class, event_level = 'second') %>%
    mutate(part = "training"),
  scored_test_log_model2 %>%
    precision(is_exciting, .pred_class, event_level = 'second') %>%
    mutate(part = "testing")
) %>% print()
## # A tibble: 2 × 4
## .metric .estimator .estimate part
## <chr> <chr> <dbl> <chr>
## 1 precision binary 0.718 training
## 2 precision binary 0.702 testing
# Recall on train and test. event_level = 'second' targets the "TRUE"
# class, consistent with the precision computation above; the original
# omitted it, so recall was reported for the majority "FALSE" class
# (which is why the recorded values were ~0.98).
scored_train_log_model2 %>%
  recall(is_exciting, .pred_class, event_level = 'second') %>%
  mutate(part = "training") %>%
  bind_rows(scored_test_log_model2 %>%
              recall(is_exciting, .pred_class, event_level = 'second') %>%
              mutate(part = "testing")) %>% print()
## # A tibble: 2 × 4
## .metric .estimator .estimate part
## <chr> <chr> <dbl> <chr>
## 1 recall binary 0.981 training
## 2 recall binary 0.979 testing
# -- ROC curves for train and test overlaid on one chart
bind_rows(
  scored_train_log_model2 %>% mutate(model = "train"),
  scored_test_log_model2 %>% mutate(model = "test")
) %>%
  group_by(model) %>%
  roc_curve(is_exciting, .pred_FALSE) %>%
  autoplot()
## confusion matrices, rendered as heatmaps
plot_conf_heatmap <- function(scored, title) {
  scored %>%
    conf_mat(is_exciting, .pred_class) %>%
    autoplot(type = "heatmap") +
    labs(title = title)
}
plot_conf_heatmap(scored_train_log_model2, "Train Confusion Matrix")
plot_conf_heatmap(scored_test_log_model2, "Test Confusion Matrix")
# Refit the full logistic model inside a workflow so the recipe and model
# travel together (note: this rebinds log_model, replacing the earlier fit).
log_model <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
log_workflow <- workflow() %>%
add_recipe(donor_recipe) %>%
add_model(log_model) %>%
fit(train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table from the workflow fit, rounded to 3 dp.
# where() wraps the predicate (a bare is.numeric in across() triggered the
# deprecation warning below), and the lambda replaces the deprecated
# across(.cols, round, 3) argument-passing form.
log_workflow %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  mutate(across(where(is.numeric), ~ round(.x, 3)))
## Warning: Predicate functions must be wrapped in `where()`.
##
## # Bad
## data %>% select(is.numeric)
##
## # Good
## data %>% select(where(is.numeric))
##
## ℹ Please update your code.
## This message is displayed once per session.
# Random forest: 100 trees, min node size 10, impurity-based importance.
rf_model <- rand_forest(trees = 100, min_n = 10) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "impurity")
rf_workflow <- workflow() %>%
  add_recipe(donor_recipe) %>%
  add_model(rf_model) %>%
  fit(train)
# -- score both partitions: probabilities, predicted class, then raw rows
score_partition <- function(df) {
  predict(rf_workflow, df, type = "prob") %>%
    bind_cols(predict(rf_workflow, df, type = "class")) %>%
    bind_cols(df)
}
scored_test <- score_partition(test)
scored_train <- score_partition(train)
# evaluation -------------------------------------------------------------
# `options(yardstick.event_first = FALSE)` is deprecated and ignored by
# modern yardstick (see the warning it emitted); pass
# `event_level = "second"` directly instead — the positive class TRUE is
# the second factor level of is_exciting.
test_score <- predict(rf_workflow, test, type = "prob") %>%
  bind_cols(predict(rf_workflow, test, type = "class")) %>%
  bind_cols(test)
# -- metrics (same set `metrics()` computes for class + prob columns) --
rf_metric_set <- metric_set(accuracy, kap, mn_log_loss, roc_auc)
test_score %>%
  rf_metric_set(is_exciting, .pred_TRUE, estimate = .pred_class,
                event_level = "second")
# -- roc curve plot --
# `event_level = "second"` is required: the deprecated global option is
# ignored, so scoring `.pred_TRUE` against the default first-level event
# (FALSE) would flip sensitivity/specificity.
test_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot()
# -- calculate operating range: best threshold/TPR per rounded FPR bin --
test_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  mutate(fpr = round(1 - specificity, 2),
         tpr = round(sensitivity, 3),
         score_threshold = round(.threshold, 3)) %>%
  group_by(fpr) %>%
  summarise(threshold = max(score_threshold),
            tpr = max(tpr)) %>%
  filter(fpr >= 0.01, fpr <= 0.10)
# -- roc curve at the FPR operating range --
# Constants belong outside aes(); the original mapped color="red" inside
# aes(), which creates a bogus legend instead of coloring the line red.
test_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot() +
  geom_vline(xintercept = 0.05, color = "red") +
  labs(title = "ROC operating at 5% FPR")
# -- calculate precision at the default 0.50 threshold --
# The original pipeline built a 0/1 .pred_class but discarded the result
# and never called precision(); its 0/1 coding also would not have
# matched the TRUE/FALSE levels of is_exciting. Here the re-thresholded
# class reuses is_exciting's own levels (presumably FALSE/TRUE, matching
# the .pred_FALSE/.pred_TRUE columns — verify) so precision() evaluates.
test_score %>%
  mutate(.pred_class = factor(.pred_TRUE >= 0.5,
                              levels = levels(is_exciting))) %>%
  precision(is_exciting, .pred_class, event_level = "second")
# -- Confusion Matrices --
test_score %>%
  conf_mat(is_exciting, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Test Confusion Matrix")
# Same fix as the test-set evaluation: the yardstick.event_first option
# is deprecated/ignored, so `event_level = "second"` is passed explicitly
# (TRUE is the second factor level of is_exciting).
train_score <- predict(rf_workflow, train, type = "prob") %>%
  bind_cols(predict(rf_workflow, train, type = "class")) %>%
  bind_cols(train)
# -- metrics (same set `metrics()` computes for class + prob columns) --
train_score %>%
  metric_set(accuracy, kap, mn_log_loss, roc_auc)(
    is_exciting, .pred_TRUE, estimate = .pred_class,
    event_level = "second")
# -- roc curve plot --
# `event_level = "second"` mirrors the test-set fix: the deprecated
# global option is ignored, so the curve would otherwise be inverted.
train_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot()
# -- calculate operating range: best threshold/TPR per rounded FPR bin --
train_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  mutate(fpr = round(1 - specificity, 2),
         tpr = round(sensitivity, 3),
         score_threshold = round(.threshold, 3)) %>%
  group_by(fpr) %>%
  summarise(threshold = max(score_threshold),
            tpr = max(tpr)) %>%
  filter(fpr >= 0.01, fpr <= 0.10)
# -- roc curve at the FPR operating range --
# geom_vline constants moved outside aes() (a constant inside aes()
# creates a spurious legend instead of coloring the line).
train_score %>%
  roc_curve(is_exciting, .pred_TRUE, event_level = "second") %>%
  autoplot() +
  geom_vline(xintercept = 0.05, color = "red") +
  labs(title = "ROC operating at 5% FPR")
# -- calculate precision at the default 0.50 threshold --
# As on the test set, the original discarded the re-thresholded class and
# never computed precision; the 0/1 coding also would not have matched
# is_exciting's TRUE/FALSE levels. The re-coded class reuses
# is_exciting's own levels (presumably FALSE/TRUE — verify) so
# precision() can actually be evaluated.
train_score %>%
  mutate(.pred_class = factor(.pred_TRUE >= 0.5,
                              levels = levels(is_exciting))) %>%
  precision(is_exciting, .pred_class, event_level = "second")
# -- Confusion Matrices --
train_score %>%
  conf_mat(is_exciting, .pred_class) %>%
  autoplot(type = "heatmap") +
  labs(title = "Train Confusion Matrix")
## VIP: impurity-based variable importance from the ranger fit.
# `pull_workflow_fit()` was deprecated in workflows 0.2.3 (the script
# itself recorded that warning); `extract_fit_parsnip()` is the
# documented replacement.
rf_workflow %>%
  extract_fit_parsnip() %>%
  vip()